import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#Supress warnings
import warnings
warnings.filterwarnings('ignore')
#Reading CSV data file (UCI concrete compressive strength dataset)
concrete=pd.read_csv("concrete.csv")
concrete.head()
#all features are float or int type and do not have missing data
concrete.info()
#no missing values reported by pandas
concrete.isnull().sum()
#There are many zeros in 3 features, believe those are the missing value and have been entered as 0.
#Need to update the same
#count of exact-zero entries per column
print(concrete[concrete==0].count(axis=0))
print("\nPercentage of zeros\n")
print(round(concrete[concrete==0].count(axis=0)/len(concrete.index)*100,2))
#replacing zeros with NaN's — deferred: the live replacement happens later,
#after the age/strength groups are built, so these lines stay commented out
#concrete.replace(0,np.nan,inplace=True)
#checking for zeros now
#print("Zeros now in data")
#print(concrete[concrete==0].count(axis=0))
#checking missing now, need to look at them. First lets study features.
#print("\nNow missing values in data")
#print(concrete.isnull().sum())
#diff between mean and 50% is normal in most variables except age, which is as expected
#cement, slag, water, superplastic, fineagg, strength have tail towards right
#ash, coarseagg have tail towards left
#age has a very long tail towards right
concrete.describe()
#kdeplot — univariate KDE for every column, laid out on a 3x3 grid
plt.figure(figsize=(15,15))
kde_order = ['cement', 'slag', 'ash', 'water', 'superplastic',
             'coarseagg', 'fineagg', 'age', 'strength']
for slot, feature in enumerate(kde_order, start=1):
    plt.subplot(3, 3, slot)
    sns.kdeplot(concrete[feature])
# Observation: the data is multimodal; age and superplastic have right-hand
# tails. Two-variable views further below give a fuller picture.
#boxplot — one box per column, mirroring the 3x3 KDE layout above
plt.figure(figsize=(15,15))
box_order = ['cement', 'slag', 'ash', 'water', 'superplastic',
             'coarseagg', 'fineagg', 'age', 'strength']
for slot, feature in enumerate(box_order, start=1):
    plt.subplot(3, 3, slot)
    sns.boxplot(concrete[feature])
#For now ignoring all outlier points as we dont have subject knowledge of concrete mixes.
#NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 and removed later;
#histplot/displot is the modern replacement — confirm the installed seaborn version.
sns.distplot(concrete['age'])
#Max data points is for 28 days concrete age
concrete['age'].value_counts().sort_index()
#pairwise scatter matrix with KDE on the diagonal
sns.pairplot(concrete,diag_kind='kde')
#Cement and strength seem to have a positive relation, superplastic and water have a -ive relation. Rest have
#weak relation with each other. The same could be seen in heatmap below.
#Also, as age of concrete increases, low strength values disappear from the scatter.
corr=concrete.corr()
plt.figure(figsize=(10,6))
sns.heatmap(corr,annot=True,cmap='YlGnBu')
#As per above, binning age to observe data
def bal_group(series):
    """Map a concrete age (in days) to a coarse age-band label.

    Returns None (stored as NaN after Series.apply) for values below
    1 day or in the gaps between the integer bin edges
    (e.g. 15 < series < 16, which cannot occur for integer ages).
    """
    # Indentation restored here: the flattened notebook source had lost it,
    # leaving the function body syntactically invalid.
    if 1 <= series <= 15:
        return "1. 1-15 days"
    elif 16 <= series <= 30:
        return "2. 16-30 days"
    elif 31 <= series <= 90:
        return "3. 1-3 months"
    elif series >= 91:
        return "4. 3 months+"
    return None  # explicit fall-through for out-of-range input
#Label each row with its age band (uses the age bal_group defined just above)
concrete['age_group'] = concrete['age'].apply(bal_group)
#Also binning strength to observe data
def bal_group(series):  # NOTE: intentionally rebinds the age-binning name above
    """Map a strength value to "Low" [1, 17], "Medium" (17, 28], or "High" (>28).

    Returns None (stored as NaN after Series.apply) for values below 1.
    """
    # Indentation restored here: the flattened notebook source had lost it,
    # leaving the function body syntactically invalid.
    if 1 <= series <= 17:
        return "1. Low"
    elif 17 < series <= 28:
        return "2. Medium"
    elif series > 28:
        return "3. High"
    return None  # explicit fall-through for out-of-range input
#Label each row with its strength band (uses the strength bal_group defined just above)
concrete['strength_group'] = concrete['strength'].apply(bal_group)
print("Age Grouping")
#share of rows per band, in percent
print(concrete.age_group.value_counts().sort_index()/len(concrete.age_group)*100)
print("\nStrength Grouping")
print(concrete.strength_group.value_counts().sort_index()/len(concrete.strength_group)*100)
# Joint KDE of strength against each predictor, one small figure per pair.
# Same call sequence as the hand-written version: figure, then jointplot,
# for each feature in the original order.
for feature in ['cement', 'slag', 'ash', 'water',
                'superplastic', 'coarseagg', 'fineagg', 'age']:
    plt.figure(figsize=(4,4))
    sns.jointplot(concrete['strength'], concrete[feature], kind='kde', color='Red')
#We can see that most data for strength is concentrated around range 20-40
#median of every column within each strength band
concrete.groupby(['strength_group']).agg(['median'])
#median of every column within each age band
concrete.groupby(['age_group']).agg(['median'])
#medians and row counts per (strength band, age band) cell —
#these group medians drive the imputation further below
concrete.groupby(['strength_group','age_group']).agg(['median'])
concrete.groupby(['strength_group','age_group']).agg(['count'])
#strength vs water, coloured by age band
plt.figure(figsize=(8,6))
sns.scatterplot(concrete['water'],concrete['strength'],hue=concrete['age_group'],palette='tab10_r')
#strength vs superplastic, coloured by age band:
#superplastic is used in low amounts, and strength increases as the mix ages.
plt.figure(figsize=(8,6))
sns.scatterplot(concrete['superplastic'],concrete['strength'],hue=concrete['age_group'],palette='tab10_r')
#water vs superplastic, coloured by age band
plt.figure(figsize=(8,6))
sns.scatterplot(concrete['superplastic'],concrete['water'],hue=concrete['age_group'],palette='tab10_r')
#water vs superplastic, coloured by strength band
plt.figure(figsize=(8,6))
sns.scatterplot(concrete['superplastic'],concrete['water'],hue=concrete['strength_group'])
#replacing zeros with NaN's
#NOTE(review): this replaces 0 in EVERY column, not only slag/ash/superplastic;
#safe only if no other column legitimately contains zeros — verify against the
#zero counts printed earlier.
concrete.replace(0,np.nan,inplace=True)
#checking for zeros now
print("Zeros now in data")
print(concrete[concrete==0].count(axis=0))
#checking missing now, need to look at them. First lets study features.
print("\nNow missing values in data")
print(concrete.isnull().sum())
#Filling NA using the median of each (strength_group, age_group) cell
#NOTE(review): imputation statistics come from the full dataset before the
#train/test split — mild leakage; consider imputing after the split.
con_col=['slag','ash','superplastic']
for item in con_col:
    # assignment instead of fillna(inplace=True) on a column selection,
    # which is deprecated and a no-op under pandas copy-on-write
    concrete[item] = concrete[item].fillna(
        concrete.groupby(['strength_group','age_group'])[item].transform('median'))
#checking of blanks
concrete.isnull().sum()
concrete.describe()
#Filling of remaining blanks (cells whose group median was itself NaN)
#with the overall column median
for item in con_col:
    concrete[item] = concrete[item].fillna(concrete[item].median())
#Finally checking of blanks
concrete.isnull().sum()
concrete.columns
#the helper band columns are no longer needed once imputation is done
concrete.drop(['strength_group','age_group'],inplace=True,axis=1)
concrete.shape
#considering concrete as final data, now starting with analysis.
#Doing test train split (70/30, fixed seed for reproducibility)
from sklearn.model_selection import train_test_split
concrete_train,concrete_test=train_test_split(concrete,train_size=0.7,random_state=100)
concrete_train.shape
#Doing Scaling of the variables
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
#To train dataset — fit on train only, so no scaling leakage from test
concrete_train_scl=scaler.fit_transform(concrete_train)
#To test Dataset — reuse the train-fitted scaler
concrete_test_scl=scaler.transform(concrete_test)
concrete_train_scl=pd.DataFrame(concrete_train_scl,columns=concrete_train.columns)
concrete_test_scl=pd.DataFrame(concrete_test_scl,columns=concrete_test.columns)
#X Y split of train data
#NOTE: the scaler was fit on ALL columns including 'strength', so the target
#below is standardised too — every RMSE/MAE reported later is in scaled units.
X_concrete_train=concrete_train_scl.drop(['strength'],axis=1)
Y_concrete_train=concrete_train_scl['strength']
#X Y split of test data
X_concrete_test=concrete_test_scl.drop(['strength'],axis=1)
Y_concrete_test=concrete_test_scl['strength']
# Linear Regression — statsmodels OLS on all scaled features
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
#Add constant (intercept column) for OLS
X_concrete_train_1=sm.add_constant(X_concrete_train)
lm1=sm.OLS(Y_concrete_train,X_concrete_train_1).fit()
print(lm1.summary())
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_concrete_train_1.columns
vif['VIF'] = [variance_inflation_factor(X_concrete_train_1.values, i) for i in range(X_concrete_train_1.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
#dropping slag; there is no significant change.
#NOTE(review): the original comment said "coarseagg" but the code drops 'slag'.
X_concrete_train_2 = X_concrete_train.drop(['slag'],axis=1)
#ITERATION 6
#NOTE(review): unlike lm1, no constant is added here, so this OLS fits without an intercept.
lm2=sm.OLS(Y_concrete_train,X_concrete_train_2).fit()
print(lm2.summary())
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_concrete_train_2.columns
vif['VIF'] = [variance_inflation_factor(X_concrete_train_2.values, i) for i in range(X_concrete_train_2.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
#Trying RFE (recursive feature elimination down to 6 features)
# Import RFE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
# RFE with 6 features
lm3 = LinearRegression()
# n_features_to_select must be passed by keyword: the positional form
# RFE(lm3, 6) was deprecated in scikit-learn 1.0 and removed in 1.2.
rfe1 = RFE(lm3, n_features_to_select=6)
# Fit with 6 features
rfe1.fit(X_concrete_train, Y_concrete_train)
# Print the boolean results
print(rfe1.support_)
print(rfe1.ranking_)
# Subset the features selected by rfe1
col1 = X_concrete_train.columns[rfe1.support_]
# Subsetting training data for 6 selected columns
X_train_rfe1 = X_concrete_train[col1]
# Add a constant to the model
X_train_rfe1 = sm.add_constant(X_train_rfe1)
X_train_rfe1.head()
# Fitting the model with 6 variables
lm4 = sm.OLS(Y_concrete_train, X_train_rfe1).fit()
print(lm4.summary())
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_train_rfe1.columns
vif['VIF'] = [variance_inflation_factor(X_train_rfe1.values, i) for i in range(X_train_rfe1.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
#Selecting the first linear regression model (lm1) and doing residual analysis
#Error term should be normally distributed for OLS inference to hold
Y_concrete_train_predict = lm1.predict(X_concrete_train_1)
# Plot the histogram of the error terms
fig = plt.figure()
sns.distplot((Y_concrete_train - Y_concrete_train_predict), bins = 20)
fig.suptitle('Error Terms', fontsize = 20) # Plot heading
plt.xlabel('Errors', fontsize = 18) # X-label
#Making Predictions
# Add a constant to the test set created (lm1 was fit with an intercept)
X_test = sm.add_constant(X_concrete_test)
X_test.info()
# Making predictions
y_pred = lm1.predict(X_test)
# Plotting y_test and y_pred to understand the spread
fig = plt.figure()
plt.scatter(Y_concrete_test, y_pred)
fig.suptitle('y_test vs y_pred', fontsize = 20) # Plot heading
plt.xlabel('y_test', fontsize = 18) # X-label
plt.ylabel('y_pred', fontsize = 16)
# r2_score and error metrics (target is standardised, so errors are in scaled units)
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
print("R-square: ",r2_score(Y_concrete_test, y_pred))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_pred)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_pred))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_pred))
#Actual vs Predicted
# Index derived from the test set itself instead of the hard-coded
# range(1, 310), which silently breaks if the split size ever changes.
c = list(range(1, len(Y_concrete_test) + 1))
fig = plt.figure(figsize=(18,6))
plt.plot(c,Y_concrete_test, color="blue", linewidth=3.5, linestyle="-")
plt.plot(c,y_pred, color="red", linewidth=1.5, linestyle="-")
fig.suptitle('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#model traverses through output well, however is not able to predict abrupt highs and lows.
#Ridge Regression
from sklearn.linear_model import Ridge
#define model (default alpha=1.0)
Ridge_model_1=Ridge()
#training model
Ridge_model_1.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_Ridge_pred_1 = Ridge_model_1.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_Ridge_pred_1))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_Ridge_pred_1)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_Ridge_pred_1))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_Ridge_pred_1))
#actual (blue) vs predicted (red)
#NOTE(review): range(309) hard-codes the test-set size (here and in every
#plot below) — breaks if the 70/30 split ever changes.
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_Ridge_pred_1,c='r',linewidth=1.5)
#define model again, with a much weaker penalty (alpha=0.01); note the
#variable names are reused, overwriting the first ridge fit
Ridge_model_1=Ridge(alpha=0.01)
#training model
Ridge_model_1.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_Ridge_pred_1 = Ridge_model_1.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_Ridge_pred_1))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_Ridge_pred_1)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_Ridge_pred_1))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_Ridge_pred_1))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_Ridge_pred_1,c='r',linewidth=1.5)
#Lasso Regression
from sklearn.linear_model import Lasso
#define model (small alpha, default iteration cap)
Lasso_model_1=Lasso(alpha=0.001, max_iter=1000)
#training model
Lasso_model_1.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_Lasso_pred_1 = Lasso_model_1.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_Lasso_pred_1))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_Lasso_pred_1)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_Lasso_pred_1))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_Lasso_pred_1))
#actual (blue) vs predicted (red)
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_Lasso_pred_1,c='r',linewidth=1.5)
#Polynomial Regression — degree-2 feature expansion feeding a linear model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('poly',PolynomialFeatures(degree=2)),('lr', LinearRegression())])
pipeline.fit(X_concrete_train, Y_concrete_train)
y_poly_pred = pipeline.predict(X_concrete_test)
print("R-square: ",r2_score(Y_concrete_test, y_poly_pred))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_poly_pred)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_poly_pred))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_poly_pred))
#actual (blue) vs predicted (red)
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_poly_pred,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Support Vector Regressor
from sklearn.svm import SVR
#model define (RBF kernel, default C and epsilon)
SVR_model_1 = SVR(kernel='rbf')
#training model
SVR_model_1.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_SVR_pred = SVR_model_1.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_SVR_pred))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_SVR_pred)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_SVR_pred))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_SVR_pred))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_SVR_pred,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#model define — second SVR with an explicit, wider epsilon tube (0.2)
SVR_model_2 = SVR(gamma='scale',C=1.0, epsilon=0.2,kernel='rbf')
#training model
SVR_model_2.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_SVR_pred_2 = SVR_model_2.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_SVR_pred_2))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_SVR_pred_2)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_SVR_pred_2))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_SVR_pred_2))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_SVR_pred_2,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#KNN Regressor
from sklearn.neighbors import KNeighborsRegressor
#define model (defaults: 5 neighbours, uniform weights)
KNNR_model_1=KNeighborsRegressor()
#training model
KNNR_model_1.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_KNNR_pred_1 = KNNR_model_1.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_KNNR_pred_1))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_KNNR_pred_1)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_KNNR_pred_1))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_KNNR_pred_1))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_KNNR_pred_1,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#define model — same k, but neighbours weighted by inverse distance
KNNR_model_2=KNeighborsRegressor(n_neighbors=5,weights='distance')
#training model
KNNR_model_2.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_KNNR_pred_2 = KNNR_model_2.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_KNNR_pred_2))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_KNNR_pred_2)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_KNNR_pred_2))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_KNNR_pred_2))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_KNNR_pred_2,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Decision Tree
from sklearn.tree import DecisionTreeRegressor
#define model (fully-grown tree; no depth/leaf limits, so it can overfit)
DT_model_1=DecisionTreeRegressor()
#training model
DT_model_1.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_DT_pred_1 = DT_model_1.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_DT_pred_1))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_DT_pred_1)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_DT_pred_1))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_DT_pred_1))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_DT_pred_1,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Feature Importance
dt_imp_feature_1=pd.DataFrame(DT_model_1.feature_importances_, columns = ["Imp"], index = X_concrete_test.columns)
dt_imp_feature_1.sort_values(by="Imp",ascending=False)
#define model — pruned tree (depth and leaf-size limits)
DT_model_2=DecisionTreeRegressor(max_depth=10,min_samples_leaf=5)
#training model
DT_model_2.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_DT_pred_2 = DT_model_2.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_DT_pred_2))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_DT_pred_2)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_DT_pred_2))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_DT_pred_2))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_DT_pred_2,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Feature Importance
dt_imp_feature_2=pd.DataFrame(DT_model_2.feature_importances_, columns = ["Imp"], index = X_concrete_test.columns)
dt_imp_feature_2.sort_values(by="Imp",ascending=False)
#Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
#define model (library defaults)
RF_model_1=RandomForestRegressor()
#training model
RF_model_1.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_RF_pred_1 = RF_model_1.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_RF_pred_1))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_RF_pred_1)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_RF_pred_1))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_RF_pred_1))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_RF_pred_1,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Feature Importance
dt_imp_feature_1=pd.DataFrame(RF_model_1.feature_importances_, columns = ["Imp"], index = X_concrete_test.columns)
dt_imp_feature_1.sort_values(by="Imp",ascending=False)
#define model — smaller, regularised forest
RF_model_2=RandomForestRegressor(n_estimators=20,max_depth=10,min_samples_leaf=8)
#training model
RF_model_2.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_RF_pred_2 = RF_model_2.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_RF_pred_2))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_RF_pred_2)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_RF_pred_2))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_RF_pred_2))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_RF_pred_2,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Feature Importance
dt_imp_feature_2=pd.DataFrame(RF_model_2.feature_importances_, columns = ["Imp"], index = X_concrete_test.columns)
dt_imp_feature_2.sort_values(by="Imp",ascending=False)
#Gradient Boost Regressor
from sklearn.ensemble import GradientBoostingRegressor
#define model (library defaults)
GB_model_1=GradientBoostingRegressor()
#training model
GB_model_1.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_GB_pred_1 = GB_model_1.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_GB_pred_1))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_GB_pred_1)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_GB_pred_1))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_GB_pred_1))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_GB_pred_1,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Feature Importance
dt_imp_feature_1=pd.DataFrame(GB_model_1.feature_importances_, columns = ["Imp"], index = X_concrete_test.columns)
dt_imp_feature_1.sort_values(by="Imp",ascending=False)
#define model — more, deeper trees
GB_model_2=GradientBoostingRegressor(n_estimators=150,max_depth=5)
#training model
GB_model_2.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_GB_pred_2 = GB_model_2.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_GB_pred_2))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_GB_pred_2)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_GB_pred_2))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_GB_pred_2))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_GB_pred_2,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Feature Importance (note: reuses the dt_imp_feature_1 name from above)
dt_imp_feature_1=pd.DataFrame(GB_model_2.feature_importances_, columns = ["Imp"], index = X_concrete_test.columns)
dt_imp_feature_1.sort_values(by="Imp",ascending=False)
#XGBoost Regressor
#NOTE(review): modern xgboost exposes this directly as `from xgboost import XGBRegressor`
from xgboost.sklearn import XGBRegressor
#define model with an explicit squared-error objective
XGB_model_1=XGBRegressor(objective='reg:squarederror')
#training model
XGB_model_1.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_XGB_pred_1 = XGB_model_1.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_XGB_pred_1))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_XGB_pred_1)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_XGB_pred_1))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_XGB_pred_1))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_XGB_pred_1,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Feature Importance
dt_imp_feature_1=pd.DataFrame(XGB_model_1.feature_importances_, columns = ["Imp"], index = X_concrete_test.columns)
dt_imp_feature_1.sort_values(by="Imp",ascending=False)
#define model — more, deeper trees (objective left to the library default here)
XGB_model_2=XGBRegressor(n_estimators=150,max_depth=5)
#training model
XGB_model_2.fit(X_concrete_train,Y_concrete_train)
#predicting result over test data
y_XGB_pred_2 = XGB_model_2.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_XGB_pred_2))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_XGB_pred_2)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_XGB_pred_2))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_XGB_pred_2))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_XGB_pred_2,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Feature Importance
dt_imp_feature_2=pd.DataFrame(XGB_model_2.feature_importances_, columns = ["Imp"], index = X_concrete_test.columns)
dt_imp_feature_2.sort_values(by="Imp",ascending=False)
#dropping coarseagg and slag as both had very low importance in ensemble technique
#X_concrete_train_3 = X_concrete_train.drop(['coarseagg','slag'],axis=1)
#X_concrete_test_3 = X_concrete_test.drop(['coarseagg','slag'],axis=1)
#print(X_concrete_train_3.shape)
#print(X_concrete_test_3.shape)
#Doing Hyperparameter tuning to Random Forest, Gradient Boost and XGBoost
#Tuning of Random Forest
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
# shuffle=True is required when a random_state is given: scikit-learn >= 0.24
# raises ValueError for KFold(..., random_state=...) with shuffle=False
# (and without shuffling the seed had no effect anyway).
skf = KFold(n_splits=5, shuffle=True, random_state=1)
RF_model_3 = RandomForestRegressor()
# search grid for the forest's size/regularisation parameters
params = {"n_estimators": np.arange(10,16,2),"max_depth": np.arange(5, 9,1),
"max_features":np.arange(5,8,1),'min_samples_leaf': range(4, 12, 2),
'min_samples_split': range(20, 26, 2)}
RF_GV_1 = GridSearchCV(estimator = RF_model_3, param_grid = params,cv=skf,verbose=1,return_train_score=True)
RF_GV_1.fit(X_concrete_train,Y_concrete_train)
# results of grid search CV
RF_results = pd.DataFrame(RF_GV_1.cv_results_)
#parameters best value
best_score_rf = RF_GV_1.best_score_
best_rf = RF_GV_1.best_params_
best_rf
#Refit a forest with the chosen parameters and evaluate on the test split
#NOTE(review): these values are hand-typed — confirm they match best_rf from
#the grid search above before trusting the comparison.
rf_best = RandomForestRegressor(max_depth= 8, max_features= 6,n_estimators=12,min_samples_leaf=8,min_samples_split=20)
rf_best.fit(X_concrete_train, Y_concrete_train)
# predict
y_pred_rf_1 = rf_best.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_pred_rf_1))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_pred_rf_1)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_pred_rf_1))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_pred_rf_1))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_pred_rf_1,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Feature Importance
dt_imp_feature_2=pd.DataFrame(rf_best.feature_importances_, columns = ["Imp"], index = X_concrete_test.columns)
dt_imp_feature_2.sort_values(by="Imp",ascending=False)
#Tuning of Gradient Boosting
GB_model_3=GradientBoostingRegressor()
#NOTE(review): "n_estimators": [130,140,2] is a literal list, so a 2-tree model
#is also tried; np.arange(130,140,2) was probably intended — verify.
params = {"n_estimators": [130,140,2],"learning_rate":[0.09,0.1],"max_depth": np.arange(8, 14,2),
"max_features":np.arange(3,8,1),'min_samples_leaf': range(14, 20, 2)}
GB_GV_1 = GridSearchCV(estimator = GB_model_3, param_grid = params,cv=skf,verbose=1,return_train_score=True)
GB_GV_1.fit(X_concrete_train,Y_concrete_train)
# results of grid search CV
GB_results = pd.DataFrame(GB_GV_1.cv_results_)
#parameters best value (note: best_score_rf is overwritten here)
best_score_rf = GB_GV_1.best_score_
best_gb = GB_GV_1.best_params_
best_gb
#Refit with the chosen parameters
#NOTE(review): min_samples_leaf=12 lies outside the searched range(14, 20, 2) —
#verify against best_gb.
gb_best = GradientBoostingRegressor(learning_rate= 0.1, n_estimators= 140,max_depth= 10,
max_features= 5,min_samples_leaf=12)
gb_best.fit(X_concrete_train, Y_concrete_train)
# predict
y_pred_gb_1 = gb_best.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_pred_gb_1))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_pred_gb_1)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_pred_gb_1))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_pred_gb_1))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_pred_gb_1,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Feature Importance
dt_imp_feature_2=pd.DataFrame(gb_best.feature_importances_, columns = ["Imp"], index = X_concrete_test.columns)
dt_imp_feature_2.sort_values(by="Imp",ascending=False)
#Tuning of XGBoost
#Regularization using GridSearchCV - 1st Iteration
XGB_model_3=XGBRegressor(objective='reg:squarederror')
#NOTE(review): "n_estimators": [174,184,2] is a literal list, so a 2-tree model
#is also tried; a stepped range was probably intended — verify.
params1 = {
"colsample_bytree": [i/100.0 for i in range(48,54,1)],
"learning_rate": [0.2,0.22,0.24],
"n_estimators": [174,184,2],
"subsample": [i/100.0 for i in range(74,78,1)]
}
XGB_GV_3 = GridSearchCV(estimator = XGB_model_3, param_grid = params1,
cv=skf,
verbose = 1,
return_train_score=True)
XGB_GV_3.fit(X_concrete_train,Y_concrete_train)
# results of grid search CV
XGB_results_3 = pd.DataFrame(XGB_GV_3.cv_results_)
#parameters best value
best_score_xgb_3 = XGB_GV_3.best_score_
best_xgb_3 = XGB_GV_3.best_params_
best_xgb_3
#Choosing best parameter from 1st Iteration
#NOTE(review): n_estimators=180 is not one of the searched values [174,184,2] —
#verify against best_xgb_3.
xgb_best_3 = XGBRegressor(colsample_bytree=0.5,learning_rate=0.22,n_estimators=180,subsample=0.75,objective='reg:squarederror')
xgb_best_3.fit(X_concrete_train, Y_concrete_train)
# predict
y_pred_xgb_3 = xgb_best_3.predict(X_concrete_test)
#Results
print("R2 score : " , r2_score(Y_concrete_test,y_pred_xgb_3))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_pred_xgb_3)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_pred_xgb_3))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_pred_xgb_3))
plt.figure(figsize=(18,6))
sns.lineplot(range(309),Y_concrete_test,c='b',linewidth=3.5)
sns.lineplot(range(309),y_pred_xgb_3,c='r',linewidth=1.5)
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Feature Importance
xgb_imp_feature_3=pd.DataFrame(xgb_best_3.feature_importances_, columns = ["Imp"], index = X_concrete_test.columns)
xgb_imp_feature_3.sort_values(by="Imp",ascending=False)
#Regularization using GridSearchCV - 2nd Iteration
# Tune tree-complexity parameters on top of the 1st-iteration estimator
# (GridSearchCV clones the estimator, so the earlier fit is not reused).
params2 = {
    'min_child_weight': [4, 5, 6, 7],
    "max_depth": [2, 4, 6],
}
xgb_best_4 = GridSearchCV(
    estimator=xgb_best_3,
    param_grid=params2,
    cv=skf,
    verbose=1,
    return_train_score=True,
)
xgb_best_4.fit(X_concrete_train, Y_concrete_train)
# Full cross-validation results of the grid search
XGB_results_4 = pd.DataFrame(xgb_best_4.cv_results_)
XGB_results_4
# Best score and parameter combination found
best_score_xgb_4 = xgb_best_4.best_score_
best_xgb_4 = xgb_best_4.best_params_
best_xgb_4
#Choosing best parameter from 2nd Iteration
xgb_best_4 = XGBRegressor(colsample_bytree=0.5, learning_rate=0.22, n_estimators=180,
                          subsample=0.75, objective='reg:squarederror',
                          min_child_weight=5, max_depth=4)
xgb_best_4.fit(X_concrete_train, Y_concrete_train)
# predict on the held-out test set
y_pred_xgb_4 = xgb_best_4.predict(X_concrete_test)
# Evaluation metrics on the test set
print("R2 score : " , r2_score(Y_concrete_test,y_pred_xgb_4))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_pred_xgb_4)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_pred_xgb_4))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_pred_xgb_4))
plt.figure(figsize=(18, 6))
# Use the real test-set length instead of the hard-coded 309 so the plot stays
# correct if the split changes; keyword x/y for seaborn >= 0.12 compatibility.
xgb4_plot_idx = np.arange(len(Y_concrete_test))
sns.lineplot(x=xgb4_plot_idx, y=Y_concrete_test, color='b', linewidth=3.5)  # actual
sns.lineplot(x=xgb4_plot_idx, y=y_pred_xgb_4, color='r', linewidth=1.5)     # predicted
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Feature Importance (largest first)
xgb_imp_feature_4 = pd.DataFrame(xgb_best_4.feature_importances_,
                                 columns=["Imp"], index=X_concrete_test.columns)
xgb_imp_feature_4.sort_values(by="Imp", ascending=False)
#Regularization using GridSearchCV - 3rd Iteration
# Fine-tune the minimum split-loss penalty (gamma) over 0.001 .. 0.019.
params3 = {
    'gamma': [i/1000.0 for i in range(1, 20, 2)],
}
xgb_best_5 = GridSearchCV(
    estimator=xgb_best_4,
    param_grid=params3,
    cv=skf,
    verbose=1,
    return_train_score=True,
)
xgb_best_5.fit(X_concrete_train, Y_concrete_train)
# Full cross-validation results of the grid search
XGB_results_5 = pd.DataFrame(xgb_best_5.cv_results_)
XGB_results_5
# Best score and parameter combination found
best_score_xgb_5 = xgb_best_5.best_score_
best_xgb_5 = xgb_best_5.best_params_
best_xgb_5
#Choosing best parameter from 3rd Iteration
xgb_best_5 = XGBRegressor(colsample_bytree=0.5, learning_rate=0.22, n_estimators=180,
                          subsample=0.75, objective='reg:squarederror',
                          min_child_weight=5, max_depth=4, gamma=0.001)
xgb_best_5.fit(X_concrete_train, Y_concrete_train)
# predict on the held-out test set
y_pred_xgb_5 = xgb_best_5.predict(X_concrete_test)
# Evaluation metrics on the test set
print("R2 score : " , r2_score(Y_concrete_test,y_pred_xgb_5))
print("Root Mean square error: ",np.sqrt(mean_squared_error(Y_concrete_test, y_pred_xgb_5)))
print("Mean square error: ",mean_squared_error(Y_concrete_test, y_pred_xgb_5))
print("Mean absolute error: ",mean_absolute_error(Y_concrete_test, y_pred_xgb_5))
plt.figure(figsize=(18, 6))
# Use the real test-set length instead of the hard-coded 309 so the plot stays
# correct if the split changes; keyword x/y for seaborn >= 0.12 compatibility.
xgb5_plot_idx = np.arange(len(Y_concrete_test))
sns.lineplot(x=xgb5_plot_idx, y=Y_concrete_test, color='b', linewidth=3.5)  # actual
sns.lineplot(x=xgb5_plot_idx, y=y_pred_xgb_5, color='r', linewidth=1.5)     # predicted
plt.title('Actual and Predicted', fontsize=20) # Plot heading
plt.xlabel('Index', fontsize=10) # X-label
plt.ylabel('Values', fontsize=10) # Y-label
#Feature Importance (largest first)
xgb_imp_feature_5 = pd.DataFrame(xgb_best_5.feature_importances_,
                                 columns=["Imp"], index=X_concrete_test.columns)
xgb_imp_feature_5.sort_values(by="Imp", ascending=False)
#Starting with R2 of .59 in Linear Regression we have reached a value of .92 in XGBoost (tuned with various hyperparameters)
#This is a very good model, as can be seen from the graph above. Also, the mean squared and absolute errors are very low now.
#The model seems to align with the various features. We try grouping age and see what happens.
# Per-feature view: green scatter = actual strength, red line = predicted trend.
# The eight copy-pasted subplot stanzas are collapsed into one loop; x/y are
# passed as keywords because positional data arguments were removed from
# sns.lineplot in seaborn 0.12.  NOTE(review): lineplot averages predictions
# at duplicate feature values — presumably the intended smoothed trend line.
plt.figure(figsize=(15, 20))
features = ['cement', 'slag', 'ash', 'water',
            'superplastic', 'coarseagg', 'fineagg', 'age']
for pos, feature in enumerate(features, start=1):
    plt.subplot(4, 2, pos)
    plt.scatter(X_concrete_test[feature], Y_concrete_test, s=8, c='g')
    sns.lineplot(x=X_concrete_test[feature], y=y_pred_xgb_5, color='r')